In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
import random
from numpy.random import rand
from copy import copy
from __future__ import division
import time
def read_text_words(filename, wordsnumber):
    """Read the first `wordsnumber` lines of `filename` and return them
    concatenated with all newlines removed.

    Note: despite the name, the unit is lines, not words.
    """
    with open(filename) as f:
        lines = f.readlines()
    head = lines[:wordsnumber]
    return ''.join(head).replace('\n', '')
def read_text_filesize(filename, size):
    """Read approximately `size` megabytes from the start of `filename`
    and return the text with all newlines removed.
    """
    nbytes = int(round(1024 * 1024 * size))
    with open(filename) as f:
        chunk = f.read(nbytes)
    return chunk.replace('\n', '')
def get_unicount(text):
    """Count occurrences of each lowercase letter a-z in `text`.

    Returns a length-26 numpy array where entry i is the count of
    chr(97 + i) (ord 97..122).  Characters outside a-z are ignored.

    Fix: the original indexed counts[ord(c) - 97] unconditionally, so a
    character with ord in 71..96 was silently folded into the wrong bin
    via Python's negative indexing, and anything below ord 71 (space,
    digits, most punctuation) raised IndexError.
    """
    counts = np.zeros(26)
    for ch in text:
        c = ord(ch)
        if 97 <= c <= 122:  # only 'a'..'z'
            counts[c - 97] += 1
    return counts
def get_bigram_stats_dic(text):
    """Estimate bigram conditional frequencies P(second | first) over a-z.

    Returns a dict mapping every pair of lowercase letters (a, b) to
    count(ab) / count(a) in `text`; pairs containing non-lowercase
    characters are ignored.

    Fix: the original divided by a module-level global `counts` array
    (set up elsewhere via get_unicount on the same text), a hidden-state
    dependency; the unigram counts are now computed locally, which is
    equivalent at every call site.  Division by zero for letters absent
    from the text is also guarded.
    """
    # local unigram counts over a-z (replaces the global `counts`)
    unigram = [0] * 26
    for ch in text:
        c = ord(ch)
        if 97 <= c <= 122:
            unigram[c - 97] += 1
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    dic = {}
    for a in alphabet:
        for b in alphabet:
            dic[(a, b)] = 0
    for i in range(len(text) - 1):
        pair = (text[i], text[i + 1])
        if pair in dic:
            dic[pair] += 1
    for k, v in dic.items():
        denom = unigram[ord(k[0]) - 97]
        # leave 0 (instead of dividing by zero) for letters never seen
        dic[k] = v / denom if denom > 0 else 0.0
    return dic
def quality(decrypted, original):
    """Fraction of positions at which `decrypted` matches `original`
    (comparison stops at the shorter of the two; the denominator is
    len(decrypted))."""
    matches = 0.0
    for got, want in zip(decrypted, original):
        if got == want:
            matches += 1
    return matches / len(decrypted)
def crypt(text):
    """Encrypt `text` with a random substitution cipher over a-z.

    Returns (ciphertext, p) where p is the permutation used: the letter
    chr(97 + i) is mapped to chr(97 + p[i]).  Characters outside a-z
    are dropped from the output.

    Fix: the original used a bare try/except around `p[x]`, which did
    NOT skip every non-lowercase character — for ord(ch) in 71..96
    (e.g. 'G'..'Z') the negative index p[x] was valid, so those
    characters were silently encrypted through wraparound indexing.
    An explicit range check implements the intended behavior.
    """
    p = list(range(26))  # list(...) keeps this valid on Python 2 and 3
    random.shuffle(p)
    output = ''
    for ch in text:
        x = ord(ch) - ord('a')
        if 0 <= x < 26:
            output += chr(p[x] + ord('a'))
    return output, p
def get_desiredPDF_bigram(permutation, enc=None, bigram_stats=None):
    """Log-density of a candidate `permutation` under the bigram model.

    For each adjacent pair in the ciphertext, looks up the bigram
    frequency of the pair obtained by applying `permutation`, summing
    log-probabilities; unseen pairs contribute a fixed -9 penalty.

    Generalization: `enc` and `bigram_stats` default to the notebook's
    module-level globals `encrypted` and `stats` (resolved at call time),
    so the original call get_desiredPDF_bigram(p) still works unchanged.
    """
    if enc is None:
        enc = encrypted          # module-level ciphertext (notebook state)
    if bigram_stats is None:
        bigram_stats = stats     # module-level bigram statistics
    logp = 0
    for i in range(len(enc) - 1):
        pr = bigram_stats[chr(permutation[ord(enc[i]) - 97] + 97),
                          chr(permutation[ord(enc[i + 1]) - 97] + 97)]
        if pr > 0:
            logp += math.log(pr)
        else:
            logp += -9  # penalty for pairs never seen in training
    return logp
def metropolis( desiredPDF, initValue, computableRVS, skipIterations = 1000):
    """Metropolis sampler (log-density form), yielding states forever.

    Args:
        desiredPDF: function returning the LOG density of a state.
        initValue: initial state of the chain.
        computableRVS: proposal — draws a candidate from the current state.
        skipIterations: burn-in steps performed before the first yield.

    Yields:
        (state, log_density) pairs after burn-in.

    Fix: `xrange` is Python-2-only; `range` used in a for-loop behaves
    identically on both Python 2 and 3.
    """
    random_variable = initValue
    random_variableDensityValue = desiredPDF( random_variable )
    # burn-in: advance the chain without yielding
    for i in range( skipIterations ):
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        # log acceptance probability: min(0, log p(cand) - log p(cur))
        acceptanceProb = min(0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
    # sampling phase: yield the (possibly unchanged) state each step
    while True:
        candidate = computableRVS( random_variable )
        candidateDensityValue = desiredPDF( candidate )
        acceptanceProb = min( 0, candidateDensityValue - random_variableDensityValue )
        if math.log(random.random()) < acceptanceProb:
            random_variable = candidate
            random_variableDensityValue = candidateDensityValue
        yield random_variable, random_variableDensityValue
def uniform( n ):
    """Draw a uniformly random permutation of 0..n-1 (Fisher-Yates).

    Fix: `xrange` is Python-2-only; `range` behaves identically here
    and keeps the function runnable on Python 3 as well.
    """
    # initialize with the identity permutation
    permutation = list(range( n ))
    # swap the i-th element with a random element from i..n-1 inclusive
    for i in range( n ):
        j = random.randint( i, n - 1 )
        permutation[ i ], permutation[ j ] = permutation[ j ], permutation[ i ]
    return permutation
def applyTransposition( basePermutation ):
    """Propose a neighbour permutation by swapping two random positions.

    The input is left untouched; a copied permutation with one random
    transposition applied is returned (the two positions may coincide,
    which yields the identity move).
    """
    size = len( basePermutation )
    proposal = copy( basePermutation )
    # two independent uniform positions, drawn in the same order as before
    first = random.randint( 0, size - 1 )
    second = random.randint( 0, size - 1 )
    proposal[ first ], proposal[ second ] = proposal[ second ], proposal[ first ]
    return proposal
def decrypt(permutation, encrypted):
    """Apply `permutation` letter-wise to `encrypted` and return the result.

    Each character c maps to chr(permutation[ord(c) - 97] + 97); the
    input is expected to contain only lowercase a-z.
    """
    return ''.join(chr(permutation[ord(symbol) - 97] + 97)
                   for symbol in encrypted)
Fixed test text (encrypted once here and reused by all experiments below):
In [6]:
# Load the test text: first 5000 lines of Oliver Twist, newlines stripped.
# The first 3 characters are skipped — presumably a BOM/header; TODO confirm.
fname = 'main/oliver_twist.txt'
original = read_text_words(fname, 5000)[3:]
# Encrypt with a random substitution cipher; p is the true permutation.
encrypted,p = crypt(original)
print encrypted[:20]
Vary the size of the training text (War and Peace, ~3 MB in total):
In [10]:
from sys import stdout
# Training-text sizes in MB; the whole experiment is repeated `times` times.
sizes = [0.25,0.5,0.75,1]
qs = []
times = 4
for k in xrange(times):
    for s in sizes:
        print 'Start at', time.strftime('%H:%M:%S', time.localtime())
        stdout.flush()
        # Train unigram/bigram statistics on the first s MB of War and Peace.
        # NOTE: get_bigram_stats_dic reads the module-level `counts` set here.
        train_text = read_text_filesize('main/war_and_peace.txt', s)
        counts = get_unicount(train_text)
        stats = get_bigram_stats_dic(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, 1500)
        print 'Sampling...'
        stdout.flush()
        # Keep the best (highest log-density) permutation over 1000 samples.
        y = -float("inf")
        for i in xrange( 1000 ): #<=========
            cand = metropolisgenerator.next()
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]
        et = time.time() - st
        print 'metropolis time: ', et
        print 'best density among 1000 last iterations: ', y
        print 'corresponding permutation: ', x
        decrypted = decrypt(x, encrypted)
        qs.append( quality(decrypted, original))
        print 'End at ', time.strftime('%H:%M:%S', time.localtime())
        time.sleep(1.0)
# One quality curve per repetition: 4 series over the 4 sizes.
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)
Out[10]:
In [29]:
#qualities_1 = np.array(qs).reshape((4, 4))
# NOTE(review): the reshape above is commented out, so `qualities_1` must
# come from an earlier kernel session (hidden state) — confirm before re-run.
print qualities_1
I = np.array([0, 1, 2, 3]) # take into account 'explosions'
# Mean and std of quality across the selected repetitions, per size.
means_1 = np.mean(qualities_1[I, :], 0)
stds_1 = np.std(qualities_1[I, :], 0)
sizes = [0.25, 0.5, 0.75, 1.0]
plt.title('Figure t3b2. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
# mean +/- one standard deviation envelope in red, mean in blue
plt.plot(sizes, means_1 + stds_1, 'r:+')
plt.plot(sizes, means_1 - stds_1, 'r:+')
plt.plot(sizes, means_1, 'b-x')
plt.savefig('task-3-bigram-1.png')
For small training texts, the quality graphs confirm the intuition that a small training set does not provide much information.
In [17]:
# Repeat the experiment with larger training texts (2 MB and 3 MB).
sizes = [2, 3]
qs = []
times = 4
for k in xrange(times):
    print '%d-th series running...' % k
    for j, s in enumerate(sizes):
        print j, 'Start at', time.strftime('%H:%M:%S', time.localtime())
        stdout.flush()
        # Retrain statistics on the first s MB of War and Peace.
        # NOTE: get_bigram_stats_dic reads the module-level `counts` set here.
        train_text = read_text_filesize('main/war_and_peace.txt', s)
        counts = get_unicount(train_text)
        stats = get_bigram_stats_dic(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, 1500)
        # Keep the best (highest log-density) permutation over 1000 samples.
        y = -float("inf")
        for i in xrange( 1000 ): #<=========
            cand = metropolisgenerator.next()
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]
        et = time.time() - st
        print 'metropolis time: ', et
        print 'best density among 1000 last iterations: ', y
        print 'corresponding permutation: ', x
        decrypted = decrypt(x, encrypted)
        qs.append( quality(decrypted, original))
        print 'End at ', time.strftime('%H:%M:%S', time.localtime())
        time.sleep(1.0)
qualities_2 = np.array(qs)
# One quality curve per repetition: 4 series over the 2 sizes.
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)
Out[17]:
In [31]:
# Reshape the 8 quality values into (4 repetitions x 2 sizes) and plot
# mean +/- std against training-text size.
qualities_2 = np.array(qs).reshape((4, 2))
print qualities_2
means_2 = np.mean(qualities_2, 0)
stds_2 = np.std(qualities_2, 0)
sizes = [2.0, 3.0]
plt.title('Figure t3b2. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
# mean +/- one standard deviation envelope in red, mean in blue
plt.plot(sizes, means_2 + stds_2, 'r:+')
plt.plot(sizes, means_2 - stds_2, 'r:+')
plt.plot(sizes, means_2, 'b-x')
plt.savefig('task-3-bigram-2.png')
In [32]:
# Combine both experiments (0.25-1 MB and 2-3 MB) into one summary plot.
means = np.zeros(6)
means[0:4] = means_1
means[4:6] = means_2
stds = np.zeros(6)
stds[0:4] = stds_1
stds[4:6] = stds_2
full_sizes = [0.25, 0.5, 0.75, 1.0, 2.0, 3.0]
plt.title('Figure t3b_full. Dependence quality on train text size.')
plt.xlabel('size, MB')
plt.ylabel('quality')
# mean +/- one standard deviation envelope in red, mean in blue
plt.plot(full_sizes, means + stds, 'r:+')
plt.plot(full_sizes, means - stds, 'r:+')
plt.plot(full_sizes, means, 'b-x')
plt.savefig('task-3-bigram.png')
This may be caused by too few burn-in iterations; let's raise the count to 2500:
In [53]:
# Rerun with 2500 burn-in iterations and 500 kept samples, for three
# training sizes and 3 repetitions.
sizes = [0.5, 1.75, 3]
qs = []
times = 3
for k in xrange(times):
    for s in sizes:
        # Retrain statistics on the first s MB of War and Peace.
        # NOTE: get_bigram_stats_dic reads the module-level `counts` set here.
        train_text = read_text_filesize('main/war_and_peace.txt', s)
        counts = get_unicount(train_text)
        stats = get_bigram_stats_dic(train_text)
        init_p = uniform(26)
        #Metropolis here
        st = time.time()
        computableGen = lambda t: applyTransposition(t)
        metropolisgenerator = \
            metropolis(get_desiredPDF_bigram, init_p, computableGen, 2500)
        # Keep the best (highest log-density) permutation over 500 samples.
        y = -float("inf")
        for i in xrange( 500 ): #<=========
            cand = metropolisgenerator.next()
            if (cand[1] > y):
                y = cand[1]
                x = cand[0]
        et = time.time() - st
        print 'metropolis time: ', et
        print 'best density among 500 last iterations: ', y
        print 'corresponding permutation: ', x
        decrypted = decrypt(x, encrypted)
        qs.append( quality(decrypted, original))
# NOTE(review): with times = 3 this yields only 9 values, but the call below
# slices out 4 series; the 4th slice (qs[9:12]) is empty while its x-vector
# is not — confirm this cell actually ran without a plotting error.
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)],
         sizes[:len(qs)], qs[3*len(sizes):4*len(sizes)],)
In [54]:
# Re-plot the three series (3 repetitions x 3 sizes) from the last run.
plt.plot(sizes[:len(qs)], qs[:len(sizes)],
         sizes[:len(qs)], qs[len(sizes):2*len(sizes)],
         sizes[:len(qs)], qs[2*len(sizes):3*len(sizes)])
Out[54]:
In [52]:
# Evaluate the log-density of the inverse of the true permutation p
# (i.e. the ideal decryption key) for comparison with the sampled ones.
bp = np.zeros(26, dtype=int)
# `for i in p` iterates over p's VALUES, but since a permutation's values
# are exactly 0..25, the set of assignments equals that of
# `for i in range(26)` and bp correctly becomes the inverse of p.
for i in p:
    bp[p[i]] = i
q = get_desiredPDF_bigram(bp)
print bp
print q
Now inspect the mystery text, super.txt:
In [8]:
# Peek at the beginning and middle of super.txt (0.25 MB loaded).
supertext = read_text_filesize('main/super.txt', 0.25)
print supertext[:100]
print supertext[140000:140100]
It's Shakespeare!